#############################################3 virulence test on walnut rootstock vlach; the experiment was repeated once, each time 3 plants was inoculated for each strain.

walnut_tumor <- read.table("walnut_tumor.csv", header = TRUE, sep = ",")
str(walnut_tumor)
## 'data.frame':    186 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ weight : num  0.677 0.995 0.495 0 0 ...
ggplot(walnut_tumor, aes(x=strains, y=weight, color=strains)) +
  geom_boxplot() +
  geom_jitter() +
  theme(axis.text.x=element_text(hjust=0.5,vjust=0.5,angle=90)) 

  #ylim(0,20)
ggsave("/Users/dklabuser/limin/Dissertation_papers/dissertation/chapter3_phenoTraits/tumor_walnut.png", width = 20, height = 20, units = "in")

statistic analysis of tumor weight from walnut

walnut <- read.table("walnut_tumor.csv", header = TRUE, sep = ",")

# remove 3 non-virulent strains,and mock
walnut_cleand <- walnut %>% filter(!(strains %in% c("But002","SJ003","Tul002","Mock")))

# model for final data analysis
m2 <- lm(weight ~ strains, walnut_cleand) 
qqp(residuals(m2))

## [1] 118 136
plotNormalHistogram(residuals(m2))

# log transformation to m2; 普通/一般线性模型
m2_trs <- lm(log(weight+0.5) ~ strains, walnut_cleand) 

# or with following codes to check residuals
qqp(residuals(m2_trs))

## [1] 118  51
plotNormalHistogram(residuals(m2_trs)) 

anova(m2_trs)
## Analysis of Variance Table
## 
## Response: log(weight + 0.5)
##            Df  Sum Sq Mean Sq F value   Pr(>F)   
## strains    26  46.615 1.79289  2.0666 0.004045 **
## Residuals 135 117.119 0.86755                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# categorical data use this method.
means <- emmeans(m2_trs, specs = "strains")
cld(means, alpha = 0.05, Letters = "abcdefghijklmn") # multiple comparison, tuky-test
##  strains  emmean   SE  df lower.CL upper.CL .group
##  Kin002  -0.2176 0.38 135  -0.9696    0.534  a    
##  CL001    0.0119 0.38 135  -0.7401    0.764  ab   
##  C58      0.4655 0.38 135  -0.2865    1.217  ab   
##  Kin001   0.5280 0.38 135  -0.2240    1.280  ab   
##  Tul001   0.6752 0.38 135  -0.0768    1.427  ab   
##  Yol002   0.7271 0.38 135  -0.0249    1.479  ab   
##  CC001    0.7659 0.38 135   0.0138    1.518  ab   
##  Kin003   0.8492 0.38 135   0.0971    1.601  ab   
##  Gle001   0.8511 0.38 135   0.0990    1.603  ab   
##  But001   1.0391 0.38 135   0.2871    1.791  ab   
##  Sta004   1.0671 0.38 135   0.3151    1.819  ab   
##  Yub001   1.0740 0.38 135   0.3220    1.826  ab   
##  Gle002   1.0822 0.38 135   0.3302    1.834  ab   
##  Sta001   1.1637 0.38 135   0.4117    1.916  ab   
##  Yub002   1.3169 0.38 135   0.5649    2.069  ab   
##  Sut002   1.4098 0.38 135   0.6578    2.162  ab   
##  Tul003   1.4188 0.38 135   0.6668    2.171  ab   
##  Sta003   1.4622 0.38 135   0.7102    2.214  ab   
##  Tul004   1.4840 0.38 135   0.7319    2.236  ab   
##  Sta005   1.4941 0.38 135   0.7421    2.246  ab   
##  SJ002    1.5537 0.38 135   0.8017    2.306  ab   
##  SJ001    1.5670 0.38 135   0.8150    2.319  ab   
##  Teh002   1.5805 0.38 135   0.8285    2.333  ab   
##  Yol001   1.6059 0.38 135   0.8539    2.358  ab   
##  Sta002   1.8615 0.38 135   1.1095    2.614   b   
##  Sut001   1.9181 0.38 135   1.1661    2.670   b   
##  Teh001   1.9682 0.38 135   1.2162    2.720   b   
## 
## Results are given on the log(mu + 0.5) (not the response) scale. 
## Confidence level used: 0.95 
## Results are given on the log (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 27 estimates 
## significance level used: alpha = 0.05

statistic analysis of tumor weight from walnut virulenc test on Datura plants, the experiment was repeated in 3 blocks(3 benches). On each bench there were 5 plants inoculated for each strain. In total, each strain was inoculated to 15 datura plants

datura <- read.table("datura_tumor.csv", header = TRUE, sep = ",")

# remove 3 non-virulent strains,and mock
datura_cleand <- datura %>% filter(!(strains %in% c("But002","SJ003","Tul002","Mock")))

# mixed model for final data analysis
m2 <- lmer(weight ~ strains + block + (1|line), datura_cleand) 
qqp(residuals(m2))

## [1] 333 240
plotNormalHistogram(residuals(m2))

# Do log transformation to m2  
m2_trs <- lmer(log(weight+0.5) ~ strains + block + (1|line), datura_cleand) 

# check residuals 
qqp(residuals(m2_trs))

## [1] 240 336
plotNormalHistogram(residuals(m2_trs)) 

Anova(m2_trs)
## Analysis of Deviance Table (Type II Wald chisquare tests)
## 
## Response: log(weight + 0.5)
##            Chisq Df Pr(>Chisq)    
## strains 106.2916 26  1.129e-11 ***
## block     0.0252  1     0.8739    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
means <- emmeans(m2_trs, specs = "strains")
cld(means, alpha = 0.05, Letters = "abcdefghijklmn") # multiple comparison
##  strains emmean    SE    df lower.CL upper.CL .group
##  Kin002  -0.395 0.205  75.5  -0.8038   0.0146  a    
##  Teh001   0.272 0.203  91.9  -0.1305   0.6746  ab   
##  C58      0.426 0.183 159.5   0.0642   0.7873  abc  
##  Gle001   0.433 0.185 137.2   0.0670   0.7990  abc  
##  Yub002   0.444 0.187 114.7   0.0735   0.8155  abc  
##  Teh002   0.488 0.179 212.1   0.1348   0.8415  abc  
##  Kin001   0.495 0.183 158.5   0.1335   0.8563  abc  
##  Kin003   0.511 0.183 162.1   0.1495   0.8722  abc  
##  Sut002   0.542 0.185 130.3   0.1754   0.9088  abc  
##  Tul004   0.727 0.188 108.6   0.3550   1.0987   bc  
##  Sut001   0.744 0.183 165.6   0.3832   1.1049   bc  
##  CC001    0.785 0.185 138.5   0.4189   1.1508   bc  
##  Sta002   0.823 0.180 200.1   0.4669   1.1786   bc  
##  SJ001    0.830 0.183 169.9   0.4696   1.1908   bc  
##  Gle002   0.848 0.183 159.3   0.4863   1.2093   bc  
##  CL001    0.861 0.203  84.9   0.4570   1.2657   bc  
##  Tul001   0.898 0.186 116.7   0.5296   1.2671   bc  
##  But001   0.899 0.183 165.9   0.5378   1.2596   bc  
##  Sta004   0.927 0.183 164.9   0.5660   1.2877   bc  
##  SJ002    0.965 0.181 191.8   0.6088   1.3221   bc  
##  Yol002   0.997 0.183 167.6   0.6360   1.3578   bc  
##  Sta003   1.029 0.185 142.6   0.6631   1.3941   bc  
##  Sta001   1.046 0.183 160.7   0.6846   1.4071   bc  
##  Yub001   1.065 0.183 162.7   0.7038   1.4262   bc  
##  Sta005   1.245 0.183 169.6   0.8846   1.6058   bc  
##  Tul003   1.323 0.186 124.1   0.9551   1.6901    c  
##  Yol001   1.343 0.187 116.7   0.9724   1.7134    c  
## 
## Degrees-of-freedom method: kenward-roger 
## Results are given on the log(mu + 0.5) (not the response) scale. 
## Confidence level used: 0.95 
## Results are given on the log (not the response) scale. 
## P value adjustment: tukey method for comparing a family of 27 estimates 
## significance level used: alpha = 0.05

8 Antibiotic resistance test Two antibiotics S10 and VA30 are very different from others, 29 out of 30 strains are resistant to them, giving 0 diameter of inhibition zone.

  1. carbenicillin resistance
cb <- read.csv("carbenicillin.csv", header = TRUE, sep = ",")
cb <- cb %>% filter(strains != "Yub001") # remove data points which are 0. 

# create a linear model
cbM <- lm(cb100_D ~ strains, cb)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(cbM); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(cbM)
## Analysis of Variance Table
## 
## Response: cb100_D
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## strains   28 11.5095 0.41105  28.997 < 2.2e-16 ***
## Residuals 58  0.8222 0.01418                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cb_mean <- emmeans(cbM, specs = "strains")

cld(cb_mean, alpha = 0.05, Letters = "abcdefghijklmn")
##  strains emmean     SE df lower.CL upper.CL .group   
##  Tul002    1.67 0.0687 58     1.53     1.80  a       
##  Kin002    1.70 0.0687 58     1.56     1.84  ab      
##  Sta001    1.87 0.0687 58     1.73     2.00  abc     
##  SJ003     1.88 0.0687 58     1.75     2.02  abc     
##  SJ002     1.94 0.0687 58     1.80     2.08  abcd    
##  Kin003    1.97 0.0687 58     1.83     2.10  abcd    
##  Sut002    1.97 0.0687 58     1.83     2.10  abcd    
##  Gle001    2.00 0.0687 58     1.86     2.14  abcde   
##  Tul004    2.00 0.0687 58     1.86     2.14  abcde   
##  Kin001    2.00 0.0687 58     1.86     2.14  abcde   
##  SJ001     2.03 0.0687 58     1.89     2.16  abcde   
##  But001    2.03 0.0687 58     1.90     2.17  abcde   
##  Sut001    2.03 0.0687 58     1.90     2.17  abcde   
##  Tul003    2.07 0.0687 58     1.93     2.20   bcde   
##  CC001     2.10 0.0687 58     1.96     2.24    cde   
##  Yub002    2.13 0.0687 58     2.00     2.27    cde   
##  Gle002    2.17 0.0687 58     2.03     2.30    cdef  
##  CL001     2.17 0.0687 58     2.03     2.30    cdef  
##  Sta002    2.20 0.0687 58     2.06     2.34    cdef  
##  Sta004    2.23 0.0687 58     2.10     2.37    cdef  
##  Yol001    2.23 0.0687 58     2.10     2.37    cdef  
##  Teh002    2.27 0.0687 58     2.13     2.40     def  
##  Tul001    2.27 0.0687 58     2.13     2.40     def  
##  Yol002    2.27 0.0687 58     2.13     2.40     def  
##  Teh001    2.37 0.0687 58     2.23     2.50      efg 
##  But002    2.53 0.0687 58     2.40     2.67       fg 
##  Sta003    2.67 0.0687 58     2.53     2.80        g 
##  Sta005    3.13 0.0687 58     3.00     3.27         h
##  C58       3.40 0.0687 58     3.26     3.54         h
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 29 estimates 
## significance level used: alpha = 0.05
  1. chloramphenicol
chl <- read.csv("chloramphenicol.csv", header = TRUE, sep = ",")
chl <- chl %>% filter(strains != "Kin002")
str(chl)
## 'data.frame':    87 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ c30_D  : num  2.4 2.5 2.5 0.8 1 1 1 1 0.9 1.1 ...
# create a linear model
chlM <- lm(c30_D ~ strains, chl)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(chlM); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(chlM)
## Analysis of Variance Table
## 
## Response: c30_D
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## strains   28 14.0940 0.50336   62.56 < 2.2e-16 ***
## Residuals 58  0.4667 0.00805                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
chl_mean <- emmeans(chlM, specs = "strains")

cld(chl_mean, alpha = 0.05, Letters = "abcdefghijklmn")
##  strains emmean     SE df lower.CL upper.CL .group     
##  Kin003   0.700 0.0518 58    0.596    0.804  a         
##  Tul003   0.733 0.0518 58    0.630    0.837  ab        
##  Tul004   0.767 0.0518 58    0.663    0.870  abc       
##  Teh001   0.800 0.0518 58    0.696    0.904  abcd      
##  Sta003   0.833 0.0518 58    0.730    0.937  abcde     
##  SJ003    0.867 0.0518 58    0.763    0.970  abcdef    
##  But002   0.933 0.0518 58    0.830    1.037  abcdefg   
##  Tul001   0.933 0.0518 58    0.830    1.037  abcdefg   
##  Teh002   0.933 0.0518 58    0.830    1.037  abcdefg   
##  Sut002   0.933 0.0518 58    0.830    1.037  abcdefg   
##  Sta005   0.967 0.0518 58    0.863    1.070  abcdefg   
##  CC001    0.967 0.0518 58    0.863    1.070  abcdefg   
##  Sut001   0.967 0.0518 58    0.863    1.070  abcdefg   
##  Gle002   1.000 0.0518 58    0.896    1.104   bcdefg   
##  Sta002   1.033 0.0518 58    0.930    1.137    cdefg   
##  Yol001   1.033 0.0518 58    0.930    1.137    cdefg   
##  Yol002   1.033 0.0518 58    0.930    1.137    cdefg   
##  Gle001   1.033 0.0518 58    0.930    1.137    cdefg   
##  SJ001    1.067 0.0518 58    0.963    1.170     defg   
##  Sta001   1.067 0.0518 58    0.963    1.170     defg   
##  CL001    1.067 0.0518 58    0.963    1.170     defg   
##  SJ002    1.067 0.0518 58    0.963    1.170     defg   
##  Yub001   1.100 0.0518 58    0.996    1.204      efg   
##  Kin001   1.133 0.0518 58    1.030    1.237       fg   
##  Yub002   1.167 0.0518 58    1.063    1.270        g   
##  C58      1.500 0.0518 58    1.396    1.604         h  
##  Tul002   1.900 0.0518 58    1.796    2.004          i 
##  Sta004   2.200 0.0518 58    2.096    2.304           j
##  But001   2.467 0.0518 58    2.363    2.570           j
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 29 estimates 
## significance level used: alpha = 0.05
  1. ciprofloxacin
cip <- read.csv("ciprofloxacin.csv", header = TRUE, sep = ",")
str(cip)
## 'data.frame':    90 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ cip_D  : num  2.8 3 2.9 3 3.1 3 3 3.1 3.2 3.1 ...
# create a linear model
cipM <- lm(cip_D ~ strains, cip)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(cipM); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(cipM)
## Analysis of Variance Table
## 
## Response: cip_D
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## strains   29 14.1891 0.48928  34.739 < 2.2e-16 ***
## Residuals 60  0.8451 0.01408                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cip_mean <- emmeans(cipM, specs = "strains")

cld(cip_mean, alpha = 0.05, Letters = "abcdefghijklmn")
##  strains emmean     SE df lower.CL upper.CL .group          
##  Kin002    1.90 0.0685 60     1.76     2.04  a              
##  Tul002    2.57 0.0685 60     2.43     2.70   b             
##  Kin003    2.63 0.0685 60     2.50     2.77   bc            
##  Gle001    2.67 0.0685 60     2.53     2.80   bcd           
##  SJ003     2.70 0.0685 60     2.57     2.84   bcde          
##  Sta002    2.83 0.0685 60     2.70     2.97   bcdef         
##  But001    2.90 0.0685 60     2.76     3.04   bcdefg        
##  Yub002    3.00 0.0685 60     2.86     3.14    cdefg        
##  CL001     3.03 0.0685 60     2.90     3.17     defgh       
##  Sta004    3.03 0.0685 60     2.90     3.17     defgh       
##  But002    3.03 0.0685 60     2.90     3.17     defgh       
##  SJ001     3.03 0.0685 60     2.90     3.17     defgh       
##  Sut002    3.07 0.0685 60     2.93     3.20      efghi      
##  Teh001    3.10 0.0685 60     2.96     3.24       fghi      
##  CC001     3.10 0.0685 60     2.96     3.24       fghi      
##  Sta003    3.10 0.0685 60     2.96     3.24       fghi      
##  Teh002    3.13 0.0685 60     3.00     3.27       fghij     
##  Sta001    3.20 0.0685 60     3.06     3.34       fghijk    
##  Sut001    3.23 0.0685 60     3.10     3.37        ghijkl   
##  Kin001    3.40 0.0685 60     3.26     3.54         hijklm  
##  Sta005    3.43 0.0685 60     3.30     3.57          ijklm  
##  Tul001    3.43 0.0685 60     3.30     3.57          ijklm  
##  SJ002     3.43 0.0685 60     3.30     3.57          ijklm  
##  Tul003    3.50 0.0685 60     3.36     3.64           jklmna
##  Yol002    3.50 0.0685 60     3.36     3.64           jklmna
##  Tul004    3.53 0.0685 60     3.40     3.67            klmna
##  Gle002    3.60 0.0685 60     3.46     3.74             lmna
##  Yol001    3.63 0.0685 60     3.50     3.77              mna
##  Yub001    3.67 0.0685 60     3.53     3.80              mna
##  C58       3.83 0.0685 60     3.70     3.97               na
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 30 estimates 
## significance level used: alpha = 0.05
  1. erythromycin
ery <- read.csv("erythromycin.csv", header = TRUE, sep = ",")
str(ery)
## 'data.frame':    90 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ e15_D  : num  1.1 1 1 0.8 0.7 0.6 1.2 1.2 1.3 1.1 ...
# create a linear model
eryM <- lm(e15_D ~ strains, ery)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(eryM); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(eryM)
## Analysis of Variance Table
## 
## Response: e15_D
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## strains   29 4.7779 0.16476  18.306 < 2.2e-16 ***
## Residuals 60 0.5400 0.00900                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ery_mean <- emmeans(eryM, specs = "strains")

cld(ery_mean, alpha = 0.05, Letters = "abcdefghijklmn")
##  strains emmean     SE df lower.CL upper.CL .group   
##  But002   0.700 0.0548 60    0.590    0.810  a       
##  Kin002   0.767 0.0548 60    0.657    0.876  ab      
##  Sta001   0.867 0.0548 60    0.757    0.976  abc     
##  Sta005   0.867 0.0548 60    0.757    0.976  abc     
##  Sut001   0.900 0.0548 60    0.790    1.010  abcd    
##  Gle001   0.933 0.0548 60    0.824    1.043  abcde   
##  Kin001   0.933 0.0548 60    0.824    1.043  abcde   
##  Tul003   0.967 0.0548 60    0.857    1.076  abcde   
##  SJ002    0.967 0.0548 60    0.857    1.076  abcde   
##  Gle002   0.967 0.0548 60    0.857    1.076  abcde   
##  Yol001   1.000 0.0548 60    0.890    1.110  abcde   
##  Sta003   1.000 0.0548 60    0.890    1.110  abcde   
##  But001   1.033 0.0548 60    0.924    1.143   bcde   
##  Teh001   1.067 0.0548 60    0.957    1.176   bcdef  
##  Tul004   1.067 0.0548 60    0.957    1.176   bcdef  
##  Yol002   1.067 0.0548 60    0.957    1.176   bcdef  
##  Yub002   1.067 0.0548 60    0.957    1.176   bcdef  
##  SJ001    1.067 0.0548 60    0.957    1.176   bcdef  
##  Tul001   1.100 0.0548 60    0.990    1.210    cdef  
##  Kin003   1.100 0.0548 60    0.990    1.210    cdef  
##  Sta002   1.100 0.0548 60    0.990    1.210    cdef  
##  CL001    1.133 0.0548 60    1.024    1.243    cdefg 
##  Teh002   1.133 0.0548 60    1.024    1.243    cdefg 
##  Sta004   1.167 0.0548 60    1.057    1.276    cdefg 
##  SJ003    1.200 0.0548 60    1.090    1.310     defg 
##  CC001    1.233 0.0548 60    1.124    1.343      efg 
##  Sut002   1.233 0.0548 60    1.124    1.343      efg 
##  Tul002   1.367 0.0548 60    1.257    1.476       fg 
##  C58      1.433 0.0548 60    1.324    1.543        g 
##  Yub001   2.000 0.0548 60    1.890    2.110         h
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 30 estimates 
## significance level used: alpha = 0.05
  1. rifampin resistance
rif <- read.csv("rifampin.csv", header = TRUE, sep = ",")
rif <- rif %>% filter(strains != "Sta003")
str(rif)
## 'data.frame':    87 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ rif_D  : num  1.3 1.2 1.3 1.2 1.2 1.3 1.4 1.3 1.3 1.2 ...
# create a linear model
rifM <- lm(rif_D ~ strains, rif)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(rifM); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(rifM)
## Analysis of Variance Table
## 
## Response: rif_D
##           Df  Sum Sq  Mean Sq F value    Pr(>F)    
## strains   28 1.22684 0.043816  10.269 8.545e-14 ***
## Residuals 58 0.24747 0.004267                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
rif_mean <- emmeans(rifM, specs = "strains")

cld(rif_mean, alpha = 0.05, Letters = "abcdefghi")
##  strains emmean     SE df lower.CL upper.CL .group  
##  Sta004    1.13 0.0377 58     1.06     1.21  a      
##  Sta005    1.17 0.0377 58     1.09     1.24  ab     
##  Sut001    1.17 0.0377 58     1.09     1.24  ab     
##  Gle001    1.17 0.0377 58     1.09     1.24  ab     
##  Sta001    1.20 0.0377 58     1.12     1.28  abc    
##  But002    1.23 0.0377 58     1.16     1.31  abcd   
##  Kin002    1.23 0.0377 58     1.16     1.31  abcd   
##  Teh001    1.23 0.0377 58     1.16     1.31  abcd   
##  Tul003    1.23 0.0377 58     1.16     1.31  abcd   
##  Tul004    1.23 0.0377 58     1.16     1.31  abcd   
##  SJ001     1.25 0.0377 58     1.17     1.33  abcd   
##  But001    1.27 0.0377 58     1.19     1.34  abcd   
##  CL001     1.27 0.0377 58     1.19     1.34  abcd   
##  SJ003     1.27 0.0377 58     1.19     1.34  abcd   
##  Yol001    1.27 0.0377 58     1.19     1.34  abcd   
##  Gle002    1.30 0.0377 58     1.22     1.38  abcde  
##  Tul001    1.30 0.0377 58     1.22     1.38  abcde  
##  SJ002     1.32 0.0377 58     1.24     1.39  abcde  
##  CC001     1.33 0.0377 58     1.26     1.41  abcdef 
##  Teh002    1.33 0.0377 58     1.26     1.41  abcdef 
##  Yol002    1.33 0.0377 58     1.26     1.41  abcdef 
##  Sut002    1.37 0.0377 58     1.29     1.44   bcdef 
##  Yub001    1.40 0.0377 58     1.32     1.48    cdefg
##  Sta002    1.43 0.0377 58     1.36     1.51     defg
##  Kin001    1.43 0.0377 58     1.36     1.51     defg
##  Yub002    1.50 0.0377 58     1.42     1.58      efg
##  Tul002    1.53 0.0377 58     1.46     1.61       fg
##  Kin003    1.53 0.0377 58     1.46     1.61       fg
##  C58       1.60 0.0377 58     1.52     1.68        g
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 29 estimates 
## significance level used: alpha = 0.05
  1. tetracycline resistance
tec <- read.csv("tetracycline.csv", header = TRUE, sep = ",")
str(tec)
## 'data.frame':    90 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ te30_D : num  3.4 3.5 3.3 2.3 2.2 2.3 3 2.9 3.2 3.1 ...
# create a linear model
tecM <- lm(te30_D ~ strains, tec)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(tecM); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(tecM)
## Analysis of Variance Table
## 
## Response: te30_D
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## strains   29 14.7130 0.50735  39.445 < 2.2e-16 ***
## Residuals 60  0.7717 0.01286                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tec_mean <- emmeans(tecM, specs = "strains")

cld(tec_mean, alpha = 0.05, Letters = "abcdefghiklmnopqrst")
##  strains emmean     SE df lower.CL upper.CL .group        
##  Sta001    2.23 0.0655 60     2.10     2.36  a            
##  But002    2.27 0.0655 60     2.14     2.40  ab           
##  Sut001    2.27 0.0655 60     2.14     2.40  ab           
##  Gle001    2.37 0.0655 60     2.24     2.50  abc          
##  Tul004    2.43 0.0655 60     2.30     2.56  abc          
##  SJ001     2.45 0.0655 60     2.32     2.58  abc          
##  Sta002    2.47 0.0655 60     2.34     2.60  abc          
##  SJ002     2.50 0.0655 60     2.37     2.63  abcd         
##  Sut002    2.50 0.0655 60     2.37     2.63  abcd         
##  Tul001    2.50 0.0655 60     2.37     2.63  abcd         
##  Teh001    2.60 0.0655 60     2.47     2.73   bcde        
##  Teh002    2.63 0.0655 60     2.50     2.76    cdef       
##  Yol001    2.70 0.0655 60     2.57     2.83    cdefg      
##  Gle002    2.70 0.0655 60     2.57     2.83    cdefg      
##  Sta003    2.83 0.0655 60     2.70     2.96     defgh     
##  Yol002    2.87 0.0655 60     2.74     3.00      efgh     
##  SJ003     2.93 0.0655 60     2.80     3.06      efghi    
##  Kin003    2.97 0.0655 60     2.84     3.10       fghi    
##  Sta005    3.03 0.0655 60     2.90     3.16        ghik   
##  CC001     3.03 0.0655 60     2.90     3.16        ghik   
##  Tul002    3.03 0.0655 60     2.90     3.16        ghik   
##  Kin001    3.07 0.0655 60     2.94     3.20         hikl  
##  CL001     3.10 0.0655 60     2.97     3.23         hikl  
##  Tul003    3.10 0.0655 60     2.97     3.23         hikl  
##  Yub002    3.27 0.0655 60     3.14     3.40          ikl  
##  Sta004    3.33 0.0655 60     3.20     3.46           klm 
##  Kin002    3.37 0.0655 60     3.24     3.50           klmn
##  But001    3.40 0.0655 60     3.27     3.53            lmn
##  C58       3.63 0.0655 60     3.50     3.76             mn
##  Yub001    3.70 0.0655 60     3.57     3.83              n
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 30 estimates 
## significance level used: alpha = 0.05
  1. streptomycin(S10) and vancomycin(VA30) No statistic analysis since only one strain is resitant to each of them. Generate table for association analysis
antibiotics <- read.csv("antibiotics.csv", header = TRUE, sep = ",")
str(antibiotics)
## 'data.frame':    90 obs. of  9 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ S10    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RA5    : num  1.3 1.2 1.3 1.2 1.2 1.3 1.4 1.3 1.3 1.2 ...
##  $ TE30   : num  3.4 3.5 3.3 2.3 2.2 2.3 3 2.9 3.2 3.1 ...
##  $ CB100  : num  2.1 1.9 2.1 2.7 2.5 2.4 2 2.2 2.1 2.1 ...
##  $ E15    : num  1.1 1 1 0.8 0.7 0.6 1.2 1.2 1.3 1.1 ...
##  $ VA30   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ CIP    : num  2.8 3 2.9 3 3.1 3 3 3.1 3.2 3.1 ...
##  $ C30    : num  2.4 2.5 2.5 0.8 1 1 1 1 0.9 1.1 ...
# save streptomycin to a csv table
s10 <- antibiotics %>% select(1,2)
#write.table(s10, file = "/Users/dklabuser/limin/data_analysis/phenotypic_data/csv/streptomycin.csv", sep = ",", row.names = FALSE)

# save vancomycin to a csv table
va30 <- antibiotics %>% select(1,7)
#write.table(va30, file = "/Users/dklabuser/limin/data_analysis/phenotypic_data/csv/vancomycin.csv", sep = ",", row.names = FALSE)

K84 test

k84 <- read.csv("k84.csv", header = TRUE, sep = ",")
str(k84)
## 'data.frame':    90 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ k84_D  : num  5.32 5.32 5.31 0 0 0 4.83 4.81 4.81 0 ...
k84 <- k84 %>% filter(k84_D != 0)
# create a linear model
k84M <- lm(k84_D ~ strains, k84)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(k84M); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(k84M)
## Analysis of Variance Table
## 
## Response: k84_D
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## strains   14 18.9287  1.3520  108.13 < 2.2e-16 ***
## Residuals 30  0.3751  0.0125                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
k84_mean <- emmeans(k84M, specs = "strains")

cld(k84_mean, alpha = 0.05, Letters = "abcdefg")
##  strains emmean     SE df lower.CL upper.CL .group 
##  Tul003    2.87 0.0646 30     2.73     3.00  a     
##  Yub001    4.51 0.0646 30     4.38     4.64   b    
##  Sta005    4.52 0.0646 30     4.39     4.65   b    
##  CC001     4.82 0.0646 30     4.68     4.95   bc   
##  Sta003    5.11 0.0646 30     4.98     5.24    cd  
##  Tul001    5.13 0.0646 30     5.00     5.27    cde 
##  Sta001    5.28 0.0646 30     5.15     5.41     def
##  Kin003    5.29 0.0646 30     5.16     5.43     def
##  Yol001    5.31 0.0646 30     5.18     5.45     def
##  Yol002    5.31 0.0646 30     5.18     5.45     def
##  But001    5.32 0.0646 30     5.18     5.45     def
##  C58       5.34 0.0646 30     5.21     5.48     def
##  Sta004    5.41 0.0646 30     5.27     5.54     def
##  SJ002     5.45 0.0646 30     5.32     5.59      ef
##  SJ001     5.53 0.0646 30     5.39     5.66       f
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 15 estimates 
## significance level used: alpha = 0.05

motility test

mot <- read.csv("motility.csv", header = TRUE, sep = ",")
str(mot)
## 'data.frame':    90 obs. of  2 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ mot_48h: num  3.32 3.31 3.36 3.5 3.48 3.51 3.71 3.7 3.65 3.2 ...
# create a linear model
motM <- lm(mot_48h ~ strains, mot)

# do diagnostic of data variance
par(mfrow = c(2,2)); plot(motM); par(mfrow = c(1,1))
## hat values (leverages) are all = 0.3333333
##  and there are no factor predictors; no plot no. 5

anova(motM)
## Analysis of Variance Table
## 
## Response: mot_48h
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## strains   29 7.1722 0.24732  48.304 < 2.2e-16 ***
## Residuals 60 0.3072 0.00512                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mot_mean <- emmeans(motM, specs = "strains")

cld(mot_mean, alpha = 0.05, Letters = "abcdefghiklmnopqrst")
##  strains emmean     SE df lower.CL upper.CL .group         
##  CL001     3.18 0.0413 60     3.09     3.26  a             
##  Kin002    3.26 0.0413 60     3.18     3.35  ab            
##  Yub002    3.27 0.0413 60     3.19     3.36  abc           
##  But001    3.33 0.0413 60     3.25     3.41  abcd          
##  Tul003    3.39 0.0413 60     3.31     3.47  abcde         
##  Tul002    3.44 0.0413 60     3.36     3.53   bcdef        
##  Teh002    3.46 0.0413 60     3.38     3.55   bcdefg       
##  Gle002    3.49 0.0413 60     3.41     3.57   bcdefgh      
##  But002    3.50 0.0413 60     3.41     3.58    cdefgh      
##  Teh001    3.50 0.0413 60     3.42     3.58    cdefgh      
##  Kin001    3.50 0.0413 60     3.42     3.59     defgh      
##  Sut001    3.51 0.0413 60     3.43     3.59     defgh      
##  Tul001    3.51 0.0413 60     3.43     3.59     defgh      
##  Kin003    3.59 0.0413 60     3.50     3.67      efghi     
##  C58       3.59 0.0413 60     3.51     3.68      efghi     
##  Gle001    3.62 0.0413 60     3.54     3.70       fghik    
##  Sta005    3.63 0.0413 60     3.54     3.71       fghik    
##  CC001     3.69 0.0413 60     3.60     3.77        ghikl   
##  Sta003    3.71 0.0413 60     3.62     3.79         hikl   
##  Sut002    3.71 0.0413 60     3.63     3.79         hikl   
##  Tul004    3.75 0.0413 60     3.66     3.83          ikl   
##  Sta001    3.75 0.0413 60     3.67     3.84          ikl   
##  Sta002    3.76 0.0413 60     3.68     3.85          ikl   
##  SJ002     3.81 0.0413 60     3.72     3.89          ikl   
##  Yol001    3.85 0.0413 60     3.76     3.93           klm  
##  SJ001     3.88 0.0413 60     3.79     3.96            lm  
##  Yol002    4.07 0.0413 60     3.98     4.15             mn 
##  Sta004    4.11 0.0413 60     4.03     4.19              n 
##  SJ003     4.19 0.0413 60     4.11     4.28              n 
##  Yub001    4.46 0.0413 60     4.37     4.54               o
## 
## Confidence level used: 0.95 
## P value adjustment: tukey method for comparing a family of 30 estimates 
## significance level used: alpha = 0.05

GROWTH RATE ANALYSIS

tsb1 <- read.table("box3_tsb_48hrs.csv", header = FALSE, sep = ",")

# extract table only including the obsorbance.
Table <- matrix(0, 98, 146)
for(i in 2:99){
  colname <- paste("V", i, sep = "")
  newcolumn <- tsb1[40:185,colname]
  Table[i-1,] <- newcolumn
}

Table <- t(Table)
str(Table)
##  chr [1:146, 1:98] "Time" "0:29:20" "0:59:20" "1:29:20" "1:59:20" "2:29:20" ...
Table <- as.data.frame(Table)
# clean table, and rename column names
Table <- Table[,-2]
#str(Table)
namescol <- Table[1,]
colnames(Table) <- namescol
Tab1 <- Table[-1,]

# convert Time format to secons then to hours
Tab1$Time <- (period_to_seconds(hms(Tab1$Time))/60/60)

# change column name from Time to time, and column A4 as blank
names(Tab1)[names(Tab1) == "Time"] <- "time"
names(Tab1)[names(Tab1) == "H12"] <- "blank"
# create a vector storing wells where agro grows and time and blank column
bch1 <- c("time","blank","A1","A2","A3","A5","A6","A7","A9","A10","A11","C1","C2","C3","C5","C6","C7","C9","C10","C11","E1","E2","E3","E5","E6","E7","G1","G2","G3","G5","G6","G7")

# remove other extra blank columns
Tab1 <- Tab1[,colnames(Tab1) %in% bch1]
bch2 <- c("blank","A1","A2","A3","A5","A6","A7","A9","A10","A11","C1","C2","C3","C5","C6","C7","C9","C10","C11","E1","E2","E3","E5","E6","E7","G1","G2","G3","G5","G6","G7")

# time column is already double type, the following for loop change convert the rest table contents to numberic
for (i in bch2){
  Tab1[i] <- as.numeric(unlist(Tab1[i]))
}

# After the previous preparation, Tab1 one is ready for growthcurver analysis

Tab1$time <- round(Tab1$time, 2)
Tab1 <- Tab1[1:110,]
#colnames(Tab1)

# analyze the whole plate and remove the blank row
t1 <- SummarizeGrowthByPlate(Tab1, bg_correct = "blank")[-31,]
tsb_box3 <- c("C58_1","C58_2","C58_3","SJ003_1","SJ003_2","SJ003_3","Sta004_1","Sta004_2","Sta004_3","Sut002_1","Sut002_2","Sut002_3","Sta001_1","Sta001_2","Sta001_3","Sta005_1","Sta005_2","Sta005_3","SJ001_1","SJ001_2","SJ001_3","Sta002_1","Sta002_2","Sta002_3","SJ002_1","SJ002_2","SJ002_3","Sta003_1","Sta003_2","Sta003_3")
t1$sample <- tsb_box3
#write.table(t1, file = "/Users/dklabuser/limin/data_analysis/phenotypic_data/round3_growth_curveData/growthcurver_outputs/tsb_box3.csv", row.names = FALSE, col.names = TRUE, sep = ",")

The following two chunks will run the 6 csv files at one time. box1, box2, box3, 10 agro strains for each box, the order of 10 strains in both tsb and AB medium are the same.

box3 <- c("C58","C58","C58","SJ003","SJ003","SJ003","Sta004","Sta004","Sta004","Sut002","Sut002","Sut002","Sta001","Sta001","Sta001","Sta005","Sta005","Sta005","SJ001","SJ001","SJ001","Sta002","Sta002","Sta002","SJ002","SJ002","SJ002","Sta003","Sta003","Sta003")

box2 <- c("Tul001","Tul001","Tul001","Teh001","Teh001","Teh001","Yub001","Yub001","Yub001","Tul002","Tul002","Tul002","Teh002","Teh002","Teh002","Yub002","Yub002","Yub002","Tul003","Tul003","Tul003","Yol001","Yol001","Yol001","Tul004","Tul004","Tul004","Yol002","Yol002","Yol002")

box1 <- c("CC001","CC001","CC001","Sut001","Sut001","Sut001","Kin002","Kin002","Kin002","CL001","CL001","CL001","Gle001","Gle001","Gle001","Kin003","Kin003","Kin003","But001","But001","But001","Gle002","Gle002","Gle002","But002","But002","But002","Kin001","Kin001","Kin001")
bx <-list(box1=box1, box2=box2,box3=box3)
# create a vector for 6 excel tables
tabnames <- c("box1_AB_72hrs","box2_AB_72hrs","box3_AB_72hrs","box1_tsb_72hrs","box2_tsb_72hrs","box3_tsb_48hrs")
for (tab in tabnames){
  a <- unlist(strsplit(tab,"_"))[1] # a will be assigned a vector.
  c <- a # save a to a new object called c, which is used as a name when saving results.  
  b <- unlist(strsplit(tab,"_"))[2]

  filen <- paste0("/Users/dklabuser/limin/data_analysis/phenotypic_data/round3_growth_curveData/", tab, ".csv")
tsb1 <- read.table(filen, header = FALSE, sep = ",")

# prepare path to save the data
lk <- "/Users/dklabuser/limin/data_analysis/phenotypic_data/round3_growth_curveData/growthcurver_outputs/"
pathTable <- paste0(lk,c,"_",b,".csv")
pathPlot <- paste0(lk,c,"_",b,".png")

# extract table only including the absorbance.
Table <- matrix(0, 98, 146)
for(i in 2:99){
  colname <- paste("V", i, sep = "")
  newcolumn <- tsb1[40:185,colname]
  Table[i-1,] <- newcolumn
}

Table <- t(Table)
Table <- as.data.frame(Table)


# clean table, and rename column names
Table <- Table[,-2]
namescol <- Table[1,]
colnames(Table) <- namescol
Tab1 <- Table[-1,]

# convert Time format to seconds, then to hours
Tab1$Time <- (period_to_seconds(hms(Tab1$Time))/60/60)

# change column name from Time to time, and last column as blank
names(Tab1)[names(Tab1) == "Time"] <- "time"
names(Tab1)[names(Tab1) == "H12"] <- "blank"

# create a vector storing wells where agro grows and time and blank column, this is based on my exp design, I culture 10 isolate each time.
bch1 <- c("time","blank","A1","A2","A3","A5","A6","A7","A9","A10","A11","C1","C2","C3","C5","C6","C7","C9","C10","C11","E1","E2","E3","E5","E6","E7","G1","G2","G3","G5","G6","G7")

# remove other extra blank columns
Tab1 <- Tab1[,colnames(Tab1) %in% bch1]

# create bch2 vector to convert them to numneric data type, using following for loop.
bch2 <- c("blank","A1","A2","A3","A5","A6","A7","A9","A10","A11","C1","C2","C3","C5","C6","C7","C9","C10","C11","E1","E2","E3","E5","E6","E7","G1","G2","G3","G5","G6","G7")

# time column is already double type, the following for loop change convert the rest table contents to numberic
for (i in bch2){
  #print(i)
  Tab1[i] <- as.numeric(unlist(Tab1[i]))
}

# After the previous preparation, Tab1 one is ready for growthcurve analysis

Tab1$time <- round(Tab1$time, 2)
Tab1 <- Tab1[1:110,]

# analyze the whole plate and remove the blank row
t1 <- SummarizeGrowthByPlate(Tab1, bg_correct = "blank")[-31,]
SummarizeGrowthByPlate(Tab1, plot_fit = TRUE, plot_file = pathPlot) # save to pdf failed.

# this loop is to assign real sample names to vector a created in the very beginning.
for (i in 1:3){
  nm <- names(bx)[i]
  ifelse (a == nm,a <- unlist(bx[i]), print("KO")) # if statement in r needs to specify both true and false condition
}

# rename sample column using real sample name created from for loop, this column actually becomes a row names column.
t1$sample <- a

# save the result dataframe to a table
#write.table(t1, file = pathTable, row.names = FALSE, col.names = TRUE, sep = ",")
}
## Warning in grSoftVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
##   Referenced from: /Library/Frameworks/R.framework/Versions/4.0/Resources/modules/R_X11.so
##   Reason: image not found
## Warning in cairoVersion(): unable to load shared object '/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so':
##   dlopen(/Library/Frameworks/R.framework/Resources/library/grDevices/libs//cairo.so, 6): Library not loaded: /opt/X11/lib/libXrender.1.dylib
##   Referenced from: /Library/Frameworks/R.framework/Versions/4.0/Resources/library/grDevices/libs/cairo.so
##   Reason: image not found
## Warning in grDevices::cairo_pdf(plot_file, width = 12, height = 8): failed to
## load cairo DLL
## [1] "KO"
## [1] "KO"
## Warning in grDevices::cairo_pdf(plot_file, width = 12, height = 8): failed to
## load cairo DLL
## [1] "KO"
## [1] "KO"
## Warning in grDevices::cairo_pdf(plot_file, width = 12, height = 8): failed to
## load cairo DLL
## [1] "KO"
## [1] "KO"
## Warning in grDevices::cairo_pdf(plot_file, width = 12, height = 8): failed to
## load cairo DLL
## [1] "KO"
## [1] "KO"
## Warning in grDevices::cairo_pdf(plot_file, width = 12, height = 8): failed to
## load cairo DLL
## [1] "KO"
## [1] "KO"
## Warning in grDevices::cairo_pdf(plot_file, width = 12, height = 8): failed to
## load cairo DLL
## [1] "KO"
## [1] "KO"

the above code able to do the analysis, however, failed to generate pdf plot, and only 5 graphs can be printed in R.

plot growth rate and doubling time. First merge result tables from the same medium

# prepare 3 data frames of tsb medium
b1tsb <- read.table("box1_tsb.csv", header = TRUE, sep = ",")
b2tsb <- read.table("box2_tsb.csv", header = TRUE, sep = ",")
b3tsb <- read.table("box3_tsb.csv", header = TRUE, sep = ",")

b12 <- rbind(b1tsb,b2tsb)
b123 <- rbind(b12, b3tsb)
str(b123)
## 'data.frame':    90 obs. of  10 variables:
##  $ sample: chr  "CC001" "CC001" "CC001" "Sut001" ...
##  $ k     : num  0.825 0.399 0.427 0.641 0.554 ...
##  $ n0    : num  0.02451 0.00043 0.00272 0.0012 0.0021 ...
##  $ r     : num  0.157 0.433 0.307 0.339 0.3 ...
##  $ t_mid : num  22.2 15.8 16.4 18.5 18.6 ...
##  $ t_gen : num  4.42 1.6 2.26 2.04 2.31 ...
##  $ auc_l : num  26.9 15.6 16.5 23.4 20.2 ...
##  $ auc_e : num  26.4 15.6 16.3 23.4 20.1 ...
##  $ sigma : num  0.0608 0.00865 0.03089 0.0399 0.0407 ...
##  $ note  : logi  NA NA NA NA NA NA ...
ggplot(b123, aes(x=sample, y= r, color = sample)) +
  geom_boxplot()+
  geom_jitter() +
  ggtitle("growth rate of 30 agro in TSB medium") +
  theme(axis.text.x=element_text(hjust=0.5,vjust=0.5,angle=90)) 

#ggsave(filename = "/Users/dklabuser/limin/data_analysis/phenotypic_data/round3_growth_curveData/growthcurver_outputs/tsb_GR.png", units = "cm", dpi = 3000)

ggplot(b123, aes(x=sample, y= t_gen, color = sample)) +
  geom_boxplot()+
  geom_jitter() +
  ggtitle("double time of 30 agro in TSB medium") +
  theme(axis.text.x=element_text(hjust=0.5,vjust=0.5,angle=90)) 

#ggsave(filename = "/Users/dklabuser/limin/data_analysis/phenotypic_data/round3_growth_curveData/growthcurver_outputs/tsb_dt.png", units = "cm", dpi = 3000)

# prepare 3 data frames of AB medium
b1AB <- read.table("box1_AB.csv", header = TRUE, sep = ",")
b2AB <- read.table("box2_AB.csv", header = TRUE, sep = ",")
b3AB <- read.table("box3_AB.csv", header = TRUE, sep = ",")

b12AB <- rbind(b1AB,b2AB)
b123AB <- rbind(b12AB, b3AB)
str(b123AB)
## 'data.frame':    90 obs. of  10 variables:
##  $ sample: chr  "CC001" "CC001" "CC001" "Sut001" ...
##  $ k     : num  0.184 0.23 0.237 0.341 0.321 ...
##  $ n0    : num  1.91e-06 9.24e-05 1.30e-04 2.92e-04 1.66e-04 ...
##  $ r     : num  0.492 0.325 0.311 0.292 0.326 ...
##  $ t_mid : num  23.3 24.1 24.1 24.2 23.2 ...
##  $ t_gen : num  1.41 2.14 2.23 2.37 2.13 ...
##  $ auc_l : num  5.83 7.1 7.32 10.51 10.2 ...
##  $ auc_e : num  5.8 7.06 7.29 10.47 10.17 ...
##  $ sigma : num  0.0112 0.0126 0.0124 0.0118 0.0108 ...
##  $ note  : logi  NA NA NA NA NA NA ...
ggplot(b123AB, aes(x=sample, y= r, color = sample)) +
  geom_boxplot()+
  geom_jitter() +
   ggtitle("growth rate of 30 agro in AB medium") +
  theme(axis.text.x=element_text(hjust=0.5,vjust=0.5,angle=90)) 

#ggsave(filename = "/Users/dklabuser/limin/data_analysis/phenotypic_data/round3_growth_curveData/growthcurver_outputs/AB_GR.png", units = "cm", dpi = 3000)
ggplot(b123AB, aes(x=sample, y= t_gen, color = sample)) +
  geom_boxplot()+
  geom_jitter() +
  ggtitle("double time of 30 agro in AB medium") +
  theme(axis.text.x=element_text(hjust=0.5,vjust=0.5,angle=90)) 

#ggsave(filename = "/Users/dklabuser/limin/data_analysis/phenotypic_data/round3_growth_curveData/growthcurver_outputs/AB_dt.png", units = "cm", dpi = 3000)

PHENOTYPIC DATA CORRELATION ANALYSIS ################################################## prepare data for correlation analysis. Extract mean of each phenotypic data and save as a dataframe, then scatter plot two of them to see if there is linearity between two traits.

#virulence
datura <- read.table("datura_tumor.csv", header = TRUE, sep = ",")
str(datura)
## 'data.frame':    465 obs. of  4 variables:
##  $ strains: chr  "But001" "But001" "But001" "But001" ...
##  $ weight : num  4.86 2.75 2.41 2.53 5.87 ...
##  $ block  : int  1 1 1 1 1 2 2 2 2 2 ...
##  $ line   : int  7 7 7 7 7 32 32 32 32 32 ...
od <- rep(1:15, 31)
datura <- datura %>% select(1,2) %>% cbind(od) 
datura_wide <- spread(datura,strains,weight) 
datura_wide <- datura_wide[,-1] # remove seq col
datura_wide <- datura_wide[,-11] # remove Mock col
tumor_means <- apply(datura_wide, 2, mean) # calculate mean for each isolate
df1 <- as.data.frame(tumor_means) # convert tumor weight means to a dataframe

# motility
mot <- read.csv("motility.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
mot <- cbind(mot, a)
str(mot)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ mot_48h: num  3.32 3.31 3.36 3.5 3.48 3.51 3.71 3.7 3.65 3.2 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
mot_wide <- spread(mot,strains,mot_48h)[,-1]
mot_mean <- apply(mot_wide, 2, mean)
df2 <- as.data.frame(mot_mean)

# carbenicillin
cb <- read.csv("carbenicillin.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
cb <- cbind(cb, a)
str(cb)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ cb100_D: num  2.1 1.9 2.1 2.7 2.5 2.4 2 2.2 2.1 2.1 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
cb_wide <- spread(cb,strains,cb100_D)[,-1]
cb_mean <- apply(cb_wide, 2, mean)
df3 <- as.data.frame(cb_mean)

# chloramphenicol
chl <- read.csv("chloramphenicol.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
chl <- cbind(chl, a)
str(chl)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ c30_D  : num  2.4 2.5 2.5 0.8 1 1 1 1 0.9 1.1 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
chl_wide <- spread(chl,strains,c30_D)[,-1]
chl_mean <- apply(chl_wide, 2, mean)
df4 <- as.data.frame(chl_mean)

# ciprofloxacin
cip <- read.csv("ciprofloxacin.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
cip <- cbind(cip, a)
str(cip)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ cip_D  : num  2.8 3 2.9 3 3.1 3 3 3.1 3.2 3.1 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
cip_wide <- spread(cip,strains,cip_D)[,-1]
cip_mean <- apply(cip_wide, 2, mean)
df5 <- as.data.frame(cip_mean)

# erythromycin
ery <- read.csv("erythromycin.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
ery <- cbind(ery, a)
str(ery)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ e15_D  : num  1.1 1 1 0.8 0.7 0.6 1.2 1.2 1.3 1.1 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
ery_wide <- spread(ery,strains,e15_D)[,-1]
ery_mean <- apply(ery_wide, 2, mean)
df6 <- as.data.frame(ery_mean)

# rifampin
rif <- read.csv("rifampin.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
rif <- cbind(rif, a)
str(rif)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ rif_D  : num  1.3 1.2 1.3 1.2 1.2 1.3 1.4 1.3 1.3 1.2 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
rif_wide <- spread(rif,strains,rif_D)[,-1]
rif_mean <- apply(rif_wide, 2, mean)
df7 <- as.data.frame(rif_mean)

# tetracycline
tet <- read.csv("tetracycline.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
tet <- cbind(tet, a)
str(tet)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ te30_D : num  3.4 3.5 3.3 2.3 2.2 2.3 3 2.9 3.2 3.1 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
tet_wide <- spread(tet,strains,te30_D)[,-1]
tet_mean <- apply(tet_wide, 2, mean)
df8 <- as.data.frame(tet_mean)

# k84
k84 <- read.csv("k84.csv", header = TRUE, sep = ",")
a <- rep(1:3,30)
k84 <- cbind(k84, a)
str(k84)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ k84_D  : num  5.32 5.32 5.31 0 0 0 4.83 4.81 4.81 0 ...
##  $ a      : int  1 2 3 1 2 3 1 2 3 1 ...
k84_wide <- spread(k84,strains,k84_D)[,-1]
k84_mean <- apply(k84_wide, 2, mean)
df9 <- as.data.frame(k84_mean)

# growth rate in TSB medium
df_tsb <- b123 %>% select(sample,r)
a <- rep(1:3,30)
df_tsb <- cbind(df_tsb,a)
df_tsb <- spread(df_tsb,sample,r)[,-1]
tsb_mean <- apply(df_tsb, 2, mean)
df10 <- as.data.frame(tsb_mean)

# growthrate in AB medium
df_ab <- b123AB %>% select(sample,r)
a <- rep(1:3,30)
df_ab <- cbind(df_ab,a)
df_ab <- spread(df_ab,sample,r)[,-1]
ab_mean <- apply(df_ab, 2, mean)
df11 <- as.data.frame(ab_mean)

data <- data.frame(df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11)
pairs(data) # this plot the scatterplot

cor(data, y=NULL, use = "everything",method = "pearson") # this calculate the correlation coefficient.
##              tumor_means    mot_mean      cb_mean    chl_mean     cip_mean
## tumor_means  1.000000000  0.23328349 -0.001211271  0.10545578  0.624594285
## mot_mean     0.233283485  1.00000000 -0.378131100  0.08622513  0.282789088
## cb_mean     -0.001211271 -0.37813110  1.000000000  0.04905959  0.159726609
## chl_mean     0.105455783  0.08622513  0.049059587  1.00000000  0.129254443
## cip_mean     0.624594285  0.28278909  0.159726609  0.12925444  1.000000000
## ery_mean     0.063868511  0.50268607 -0.498628993  0.28420663  0.247892920
## rif_mean    -0.173951226 -0.06792575 -0.189566297  0.14371371  0.037897170
## tet_mean    -0.023505754  0.03411365 -0.147679681  0.30934431 -0.007402748
## k84_mean     0.630016963  0.39820249  0.094606118  0.27968414  0.355152649
## tsb_mean     0.016747457  0.24116355 -0.196180138  0.26651058  0.396053474
## ab_mean     -0.105901230 -0.18609084  0.378483941 -0.16637148  0.112979675
##                ery_mean    rif_mean     tet_mean    k84_mean    tsb_mean
## tumor_means  0.06386851 -0.17395123 -0.023505754  0.63001696  0.01674746
## mot_mean     0.50268607 -0.06792575  0.034113651  0.39820249  0.24116355
## cb_mean     -0.49862899 -0.18956630 -0.147679681  0.09460612 -0.19618014
## chl_mean     0.28420663  0.14371371  0.309344311  0.27968414  0.26651058
## cip_mean     0.24789292  0.03789717 -0.007402748  0.35515265  0.39605347
## ery_mean     1.00000000  0.27714443  0.518291458  0.17650719  0.29697600
## rif_mean     0.27714443  1.00000000  0.176496514 -0.16703135  0.31978656
## tet_mean     0.51829146  0.17649651  1.000000000  0.23195963  0.19962885
## k84_mean     0.17650719 -0.16703135  0.231959628  1.00000000 -0.02683117
## tsb_mean     0.29697600  0.31978656  0.199628849 -0.02683117  1.00000000
## ab_mean      0.05392768  0.23218571  0.076060884 -0.01817376 -0.13846775
##                 ab_mean
## tumor_means -0.10590123
## mot_mean    -0.18609084
## cb_mean      0.37848394
## chl_mean    -0.16637148
## cip_mean     0.11297967
## ery_mean     0.05392768
## rif_mean     0.23218571
## tet_mean     0.07606088
## k84_mean    -0.01817376
## tsb_mean    -0.13846775
## ab_mean      1.00000000

Means of csv files. They contain 6 antibiotics, motility, k84, growth rate in TSB and AB medium, virulence. These data can also be used for preparing inputs for association analysis DBGWAS. The following generate mean of csv files of two antibiotics s10, va30, which are not included in above codes

s10 va30

test <- va30 # s10 and va30; change three place in the code

# convert each table from long to wide
test$uni <- c(rep(1:3,10))
str(test)
## 'data.frame':    90 obs. of  3 variables:
##  $ strains: chr  "But001" "But001" "But001" "But002" ...
##  $ VA30   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ uni    : int  1 2 3 1 2 3 1 2 3 1 ...
new <- spread(test,strains, VA30) %>% select(2:31) # delete the column added as unique index; change VA30

# calculate avg for each column
davg <- apply(new, 2, mean)
new2 <- rbind(new,davg) 

new3 <- new2[-c(1:3), ] # keep avg 
new3 <- data.frame(t(new3))

# save average value to a csv file for input preparation of association analysis
#write.table(new3, file = "/Users/dklabuser/limin/data_analysis/associationAnalysis/K-mer_association/cDBGWAS/agro30/strains30/va30.csv", row.names = TRUE, col.name = FALSE,sep = ",") # change save file each time.

Principal components analysis http://factominer.free.fr/factomethods/principal-components-analysis.html

#install.packages("FactoMineR")

Run PCA analysis of all phenotypic traits to see which are the most important pheno traits

# virulence- walnut tumor; 
walnut_tumor <- read.table("walnut_tumor.csv", header = TRUE, sep = ",")
walnut_tumor <- walnut_tumor[order(walnut_tumor$strains),] # reorder df alphabetically 
od <- rep(1:6, 31)
walnutumor <- walnut_tumor %>% cbind(od)
walnut_wide <- spread(walnutumor, strains, weight)
walnut_wide <- walnut_wide[,-1]
walnut_wide <- walnut_wide[,-11]
walnutumor_means <- apply(walnut_wide, 2, mean)
df0 <- as.data.frame(walnutumor_means)

# In total 12 pheno trains were includes: datura tumor, walnut tumor, mot, k84, AB, TSB, 6 antibiotic resistance data
data <- data.frame(df0,df1,df2,df3,df4,df5,df6,df7,df8,df9,df10,df11)
pairs(data) # plot the scatterplot

cor(data, y=NULL, use = "everything",method = "pearson") # calculate the correlation coefficient.
##                  walnutumor_means  tumor_means    mot_mean      cb_mean
## walnutumor_means       1.00000000  0.314189468  0.01885661  0.073969694
## tumor_means            0.31418947  1.000000000  0.23328349 -0.001211271
## mot_mean               0.01885661  0.233283485  1.00000000 -0.378131100
## cb_mean                0.07396969 -0.001211271 -0.37813110  1.000000000
## chl_mean              -0.09112783  0.105455783  0.08622513  0.049059587
## cip_mean               0.13225305  0.624594285  0.28278909  0.159726609
## ery_mean              -0.10328689  0.063868511  0.50268607 -0.498628993
## rif_mean              -0.03243981 -0.173951226 -0.06792575 -0.189566297
## tet_mean              -0.39370056 -0.023505754  0.03411365 -0.147679681
## k84_mean              -0.11143967  0.630016963  0.39820249  0.094606118
## tsb_mean              -0.08788979  0.016747457  0.24116355 -0.196180138
## ab_mean                0.03689631 -0.105901230 -0.18609084  0.378483941
##                     chl_mean     cip_mean    ery_mean    rif_mean     tet_mean
## walnutumor_means -0.09112783  0.132253046 -0.10328689 -0.03243981 -0.393700559
## tumor_means       0.10545578  0.624594285  0.06386851 -0.17395123 -0.023505754
## mot_mean          0.08622513  0.282789088  0.50268607 -0.06792575  0.034113651
## cb_mean           0.04905959  0.159726609 -0.49862899 -0.18956630 -0.147679681
## chl_mean          1.00000000  0.129254443  0.28420663  0.14371371  0.309344311
## cip_mean          0.12925444  1.000000000  0.24789292  0.03789717 -0.007402748
## ery_mean          0.28420663  0.247892920  1.00000000  0.27714443  0.518291458
## rif_mean          0.14371371  0.037897170  0.27714443  1.00000000  0.176496514
## tet_mean          0.30934431 -0.007402748  0.51829146  0.17649651  1.000000000
## k84_mean          0.27968414  0.355152649  0.17650719 -0.16703135  0.231959628
## tsb_mean          0.26651058  0.396053474  0.29697600  0.31978656  0.199628849
## ab_mean          -0.16637148  0.112979675  0.05392768  0.23218571  0.076060884
##                     k84_mean    tsb_mean     ab_mean
## walnutumor_means -0.11143967 -0.08788979  0.03689631
## tumor_means       0.63001696  0.01674746 -0.10590123
## mot_mean          0.39820249  0.24116355 -0.18609084
## cb_mean           0.09460612 -0.19618014  0.37848394
## chl_mean          0.27968414  0.26651058 -0.16637148
## cip_mean          0.35515265  0.39605347  0.11297967
## ery_mean          0.17650719  0.29697600  0.05392768
## rif_mean         -0.16703135  0.31978656  0.23218571
## tet_mean          0.23195963  0.19962885  0.07606088
## k84_mean          1.00000000 -0.02683117 -0.01817376
## tsb_mean         -0.02683117  1.00000000 -0.13846775
## ab_mean          -0.01817376 -0.13846775  1.00000000
res.pca = PCA(data, scale.unit=TRUE, ncp=5, graph=T)

align phenotypic traits with pTi phylogeny trees. Here I include df0=walnut_tumor, df1=datura_tumor, df9=k84, df10=tsb, df11=ab. Because only 27 strains have pTi, so the non-virulent strains SJ003, But002, Tul002 were exluded from the above data frames.

# walnut tumor
name <- rownames(df0)
d0 <- data.frame(name)
dfvt <- data.frame(d0,df0)
dfwt <- dfvt %>% filter(!(name %in% c("But002", "SJ003", "Tul002")))

# datura tumor
ndf1 <- rownames(df1)
d1 <- data.frame(ndf1)
dfdatura <- data.frame(d1,df1)
dfdt <- dfdatura %>% filter(!(ndf1%in% c("But002", "SJ003", "Tul002")))

# dfnew <- data.frame(dfwt,dfdt)
# write.csv(dfnew, "/Users/dklabuser/limin/data_analysis/phenotypic_data/csv/opines_tumors.csv", row.names = FALSE)

# k84 test
nk84 <- rownames(df9)
d3 <- data.frame(nk84)
df84 <- data.frame(d3, df9)
dfk84 <- df84 %>% filter(!(nk84 %in% c("But002", "SJ003", "Tul002")))

# tsb
ntsb <- rownames(df10)
d4 <- data.frame(ntsb)
dftb <- data.frame(d4, df10)
dftsb <- dftb %>% filter(!(ntsb %in% c("But002", "SJ003", "Tul002")))

# ab
nab <- rownames(df11)
d5 <- data.frame(nab)
dfa <- data.frame(d5, df11)
dfab <- dfa %>% filter(!(nab %in% c("But002", "SJ003", "Tul002")))

check number of T6SS effectors with phenotypic traits Here I include df0=walnut_tumor, df1=datura_tumor, df9=k84, df10=tsb, df11=ab

strain = c("C58","But001","But002","CL001","Kin001","Kin002","Kin003","SJ003","Sta001","Sta004","Sta005","Tul002","Yol001","Yub002","Teh002")
num = c(17,17,11,14,13,19,13,11,10,16,15,11,11,16,11)

dft6es <- data.frame(strain, num)
rn <- dft6es[,1]
dt6ss <- data.frame(dft6es[,2])
row.names(dt6ss) <- rn


# walnut tumor
name <- rownames(df0)
#d0 <- data.frame(name)
#dfvt <- data.frame(d0,df0)
dfwt_t6s <- df0 %>% filter(name %in% c("C58","But001","But002","CL001","Kin001","Kin002","Kin003","SJ003","Sta001","Sta004","Sta005","Tul002","Yol001","Yub002","Teh002"))


# datura tumor
ndf1 <- rownames(df1)
#d1 <- data.frame(ndf1)
#dfdatura <- data.frame(d1,df1)
dfdt_t6s <- df1 %>% filter(ndf1%in% c("C58","But001","But002","CL001","Kin001","Kin002","Kin003","SJ003","Sta001","Sta004","Sta005","Tul002","Yol001","Yub002","Teh002"))
str(dfdt)
## 'data.frame':    27 obs. of  2 variables:
##  $ ndf1       : chr  "But001" "C58" "CC001" "CL001" ...
##  $ tumor_means: num  2.75 1.48 2.78 2.01 1.12 ...
# k84 test
nk84 <- rownames(df9)
#d3 <- data.frame(nk84)
#df84 <- data.frame(d3, df9)
dfk84_t6s <- df9 %>% filter(nk84 %in% c("C58","But001","But002","CL001","Kin001","Kin002","Kin003","SJ003","Sta001","Sta004","Sta005","Tul002","Yol001","Yub002","Teh002"))

# tsb
ntsb <- rownames(df10)
#d4 <- data.frame(ntsb)
#dftb <- data.frame(d4, df10)
dftsb_t6s <- df10 %>% filter(ntsb %in% c("C58","But001","But002","CL001","Kin001","Kin002","Kin003","SJ003","Sta001","Sta004","Sta005","Tul002","Yol001","Yub002","Teh002"))

# ab
nab <- rownames(df11)
#d5 <- data.frame(nab)
#dfa <- data.frame(d5, df11)
dfab_t6s <- df11 %>% filter(nab %in% c("C58","But001","But002","CL001","Kin001","Kin002","Kin003","SJ003","Sta001","Sta004","Sta005","Tul002","Yol001","Yub002","Teh002"))


data <- data.frame(dt6ss,dfwt_t6s,dfdt_t6s,dfk84_t6s,dftsb_t6s,dfab_t6s)
pairs(data) # this plot the scatterplot

cor(data, y=NULL, use = "everything",method = "pearson") # this calculate the correlation coefficient.
##                  dft6es...2. walnutumor_means  tumor_means   k84_mean
## dft6es...2.        1.0000000       -0.1236971  0.100580051 0.10815768
## walnutumor_means  -0.1236971        1.0000000  0.674968554 0.37313326
## tumor_means        0.1005801        0.6749686  1.000000000 0.71347928
## k84_mean           0.1081577        0.3731333  0.713479284 1.00000000
## tsb_mean           0.1370875        0.1007635 -0.004739094 0.06174943
## ab_mean           -0.4835712        0.1706941 -0.085508181 0.03592637
##                      tsb_mean     ab_mean
## dft6es...2.       0.137087543 -0.48357115
## walnutumor_means  0.100763521  0.17069413
## tumor_means      -0.004739094 -0.08550818
## k84_mean          0.061749429  0.03592637
## tsb_mean          1.000000000 -0.12295914
## ab_mean          -0.122959139  1.00000000
ggsave("/Users/dklabuser/limin/data_analysis/T6SS/t6ss_walnut_agro/PCA_t6ss_pheno/cor_t6ss_pheno.png")
## Saving 7 x 5 in image
res.pca = PCA(data, scale.unit=TRUE, ncp=5, graph=T)

ggsave("/Users/dklabuser/limin/data_analysis/T6SS/t6ss_walnut_agro/PCA_t6ss_pheno/pcat6ss_pheno.png")
## Saving 7 x 5 in image

pTi phylogeney tree

tre <- read.tree("pTi_accessory_binary_genes.fa.newick")
n <- MRCA(tre, c("Tul003", "Teh002"))
tre <- root(tre,node = n)
tipName <- tre$tip.label

p <- ggtree(tre, layout = "rectangular") +
  geom_tiplab(size=2) 
  #geom_tippoint(size=1.5)

# rotate the tree for T-DNA right border alignment.
p <- flip(p, 6, 34) %>% flip(19,32)
p

nt_sequence <- "RB_aln.fasta"
x <- readDNAStringSet(nt_sequence)
data1 = tidy_msa(nt_sequence)
p1 <- p + geom_facet(geom = geom_msa, data = data1, panel = "T-DNA_RB", font = NULL, color = "Chemistry_NT") +
  xlim_tree(2)  

p2 <- facet_plot(p1, panel = "walnut", data = dfwt, geom = geom_segment, aes(x=0, xend=walnutumor_means, y=y, yend=y), size=3, color="red") %>%
  facet_plot(panel = "datura_tumor", data = dfdt, geom = geom_segment,aes(x=0, xend=tumor_means, y=y, yend=y),size=3,color="pink") %>% 
  facet_plot(panel = "datura_tumor", data = dfdt, geom = geom_segment,aes(x=0, xend=tumor_means, y=y, yend=y),size=3,color="magenta") %>%
  facet_plot(panel = "K84_sensitivity", data = dfk84, geom = geom_segment,aes(x=0, xend=k84_mean, y=y, yend=y),size=3,color="blue") %>%
  facet_plot(panel = "TSB_growthRate", data = dftsb, geom = geom_segment,aes(x=0, xend=tsb_mean, y=y, yend=y),size=3,color="magenta") %>%
  facet_plot(panel = "AB_growthRate", data = dfab, geom = geom_segment,aes(x=0, xend=ab_mean, y=y, yend=y),size=3,color="blue")

p2

ggsave("/Users/dklabuser/limin/Dissertation_papers/dissertation/chapter4_genetic_diversity/RB_pheno.png")
## Saving 7 x 5 in image

Perform t-test on two samples

df_tumors <- read.csv("opines_tumors.csv")
str(df_tumors)
## 'data.frame':    23 obs. of  4 variables:
##  $ name  : chr  "But001" "CC001" "Gle001" "Gle002" ...
##  $ walnut: num  3.96 2.45 3.06 3.91 2.36 ...
##  $ datura: num  2.75 2.78 1.12 2.6 1.46 ...
##  $ group : chr  "sus" "sus" "ags" "ags" ...
# evaluate if equal variance
var.test(df_tumors$walnut~ df_tumors$group)
## 
##  F test to compare two variances
## 
## data:  df_tumors$walnut by df_tumors$group
## F = 7.2965, num df = 10, denom df = 11, p-value = 0.002871
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##   2.069524 26.740905
## sample estimates:
## ratio of variances 
##           7.296462
# t-test
t.test(df_tumors$walnut~ df_tumors$group, var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  df_tumors$walnut by df_tumors$group
## t = 2.0785, df = 12.491, p-value = 0.05889
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1310119  6.1237720
## sample estimates:
## mean in group ags mean in group sus 
##          6.873727          3.877347
#t.test(df_tumors$walnut~ df_tumors$group, var.equal=TRUE)


# evaluate if true variance
var.test(df_tumors$datura~ df_tumors$group)
## 
##  F test to compare two variances
## 
## data:  df_tumors$datura by df_tumors$group
## F = 0.76724, num df = 10, denom df = 11, p-value = 0.6838
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2176158 2.8118753
## sample estimates:
## ratio of variances 
##          0.7672418
# t-test
#t.test(df_tumors$datura~ df_tumors$group, var.equal=FALSE)
t.test(df_tumors$datura~ df_tumors$group, var.equal=TRUE)
## 
##  Two Sample t-test
## 
## data:  df_tumors$datura by df_tumors$group
## t = -3.0513, df = 21, p-value = 0.006066
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.5134349 -0.2866065
## sample estimates:
## mean in group ags mean in group sus 
##          1.852618          2.752639
ggplot(df_tumors, aes(x=group, y= walnut)) +
  geom_boxplot() 

  #geom_point()
ggsave("/Users/dklabuser/limin/Dissertation_papers/dissertation/chapter4_genetic_diversity/walnut_opine.png")
## Saving 7 x 5 in image
ggplot(df_tumors, aes(x=group, y= datura)) +
  geom_boxplot(outlier.shape = NA) +  # outlier.shape = NA to hide the outlier display
  #geom_point() +
  geom_text(x=2, y=4,label = "*", size=10, color="red") # add significant star

ggsave("/Users/dklabuser/limin/Dissertation_papers/dissertation/chapter4_genetic_diversity/datura_opine.png")
## Saving 7 x 5 in image

remove outliers

df_tumors <- read.csv("opines_tumors.csv")
ggplot(df_tumors, aes(x=group, y= datura)) +
  geom_boxplot(outlier.shape = NA) +
  #geom_point() +
  geom_text(x=2, y=4,label = "*", size=10, color="red")

# t-test to check walnut and datura has significant difference on walnut tumor size caused by the same opine type.
df_opine <- read.csv("plants_opine_tumor.csv")

# agropine
df_ags <- df_opine[1:22,1:2]
# check if equal variance
var.test(df_ags$ags~df_ags$host1)
## 
##  F test to compare two variances
## 
## data:  df_ags$ags by df_ags$host1
## F = 0.021215, num df = 10, denom df = 10, p-value = 9.093e-07
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.005707794 0.078850563
## sample estimates:
## ratio of variances 
##         0.02121468
t.test(df_ags$ags~df_ags$host1, var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  df_ags$ags by df_ags$host1
## t = -3.6567, df = 10.424, p-value = 0.004122
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.063858 -1.978360
## sample estimates:
## mean in group datura mean in group walnut 
##             1.852618             6.873727
ggplot(df_ags, aes(x=host1, y= ags)) +
  geom_boxplot(outlier.shape = NA) +
    geom_text(x=1, y=6,label = "*", size=10, color="red") # add significant star

  #geom_point()
ggsave("/Users/dklabuser/limin/Dissertation_papers/dissertation/chapter4_genetic_diversity/ags_plant.png")
## Saving 7 x 5 in image
# succinamopine
df_sus <- df_opine[,3:4]
var.test(df_sus$sus~df_sus$host2)
## 
##  F test to compare two variances
## 
## data:  df_sus$sus by df_sus$host2
## F = 0.20175, num df = 11, denom df = 11, p-value = 0.01328
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.05807971 0.70082374
## sample estimates:
## ratio of variances 
##          0.2017514
t.test(df_sus$sus~df_sus$host2, var.equal=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  df_sus$sus by df_sus$host2
## t = -2.1302, df = 15.265, p-value = 0.04981
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.248357999 -0.001058667
## sample estimates:
## mean in group datura mean in group walnut 
##             2.752639             3.877347
ggplot(df_sus, aes(x=host2, y= sus)) +
  geom_boxplot(outlier.shape = NA) +  # outlier.shape = NA to hide the outlier display
  #geom_point() +
  geom_text(x=1, y=4,label = "*", size=10, color="red") # add significant star

ggsave("/Users/dklabuser/limin/Dissertation_papers/dissertation/chapter4_genetic_diversity/sus_plant.png")
## Saving 7 x 5 in image
tre30 <- read.tree("30Agro_coreGen.tre")
tree_rooted <- root.phylo(tre30, outgroup = "Yub001")
tipName <- tre$tip.label
tree_rooted
## 
## Phylogenetic tree with 30 tips and 28 internal nodes.
## 
## Tip labels:
##   SJ001, SJ002, Sta003, Sta001, Tul001, Sut001, ...
## Node labels:
##   1.000, , 1.000, 0.982, 0.937, 1.000, ...
## 
## Unrooted; includes branch lengths.
p30 <- ggtree(tree_rooted, layout = "rectangular") +
  geom_tiplab(size=2) 
p30

p1 <- facet_plot(p30+xlim_tree(0.3), panel = "walnut_tumor", data = dfvt, geom = geom_segment, aes(x=0, xend=walnutumor_means,y=y, yend=y),size=3,color="blue") %>% 
  facet_plot(panel = "datura_tumor", data = dfdatura, geom = geom_segment,aes(x=0, xend=tumor_means, y=y, yend=y),size=3,color="red") %>%
  #facet_plot(panel = "K84_sensitivity", data = df84, geom = geom_segment,aes(x=0, xend=k84_mean, y=y, yend=y),size=3,color="blue") %>% 
  facet_plot(panel = "TSB_growthRate", data = dftb, geom = geom_segment,aes(x=0, xend=tsb_mean, y=y, yend=y),size=3,color="red") %>% 
  facet_plot(panel = "AB_growthRate", data = dfa, geom = geom_segment,aes(x=0, xend=ab_mean, y=y, yend=y),size=3,color="blue")
p1